home *** CD-ROM | disk | FTP | other *** search
Wrap
/* Copyright (c) 1994 Burra Gopal, Udi Manber. All Rights Reserved. */ #include "agrep.h" extern int checksg(); extern int D; extern FILE *debug; /* All borrowed from agrep.c and are needed for searching the index */ extern ParseTree aterminals[MAXNUM_PAT]; extern int AComplexBoolean; /* returns where it found the distinguishing token: until that from prev value of begin is the current pattern (not just the "words" in it) */ CHAR * aparse_flat(begin, end, prev, next) CHAR *begin; CHAR *end; int prev; int *next; { if (begin > end) { *next = prev; return end; } if (prev & ENDSUB_EXP) prev &= ~ATTR_EXP; if ((prev & ATTR_EXP) && !(prev & VAL_EXP)) prev |= VAL_EXP; while (begin <= end) { if (*begin == ',') { prev |= OR_EXP; prev |= VAL_EXP; prev |= ENDSUB_EXP; if (prev & AND_EXP) { fprintf(stderr, "parse error at character '%c'\n", *begin); return NULL; } *next = prev; return begin; } else if (*begin == ';') { prev |= AND_EXP; prev |= VAL_EXP; prev |= ENDSUB_EXP; if (prev & OR_EXP) { fprintf(stderr, "parse error at character '%c'\n", *begin); return NULL; } *next = prev; return begin; } else if (*begin == '\\') begin ++; /* skip two things */ begin++; } *next = prev; return begin; } int asplit_pattern_flat(APattern, AM, terminals, pnum_terminals, pAParse) CHAR *APattern; int AM; ParseTree terminals[MAXNUM_PAT]; int *pnum_terminals; int *pAParse; { CHAR *buffer; CHAR *buffer_pat; CHAR *buffer_end; buffer = APattern; buffer_end = buffer + AM; *pAParse = 0; /* * buffer is the runnning pointer, buffer_pat is the place where * the distinguishing delimiter was found, buffer_end is the end. */ while (buffer_pat = aparse_flat(buffer, buffer_end, *pAParse, pAParse)) { /* there is no pattern until after the distinguishing delimiter position: some agrep garbage */ if (buffer_pat <= buffer) { buffer = buffer_pat+1; if (buffer_pat >= buffer_end) break; continue; } if (*pnum_terminals >= MAXNUM_PAT) { fprintf(stderr, "boolean expression has too many terms\n"); return -1; } terminals[*pnum_terminals].op = 0; terminals[*pnum_terminals].type = LEAF; terminals[*pnum_terminals].terminalindex = *pnum_terminals; terminals[*pnum_terminals].data.leaf.attribute = 0; /* default is no structure */ terminals[*pnum_terminals].data.leaf.value = (CHAR *)malloc(buffer_pat - buffer + 2); memcpy(terminals[*pnum_terminals].data.leaf.value, buffer, buffer_pat - buffer); /* without distinguishing delimiter */ terminals[*pnum_terminals].data.leaf.value[buffer_pat - buffer] = '\0'; (*pnum_terminals)++; if (buffer_pat >= buffer_end) break; buffer = buffer_pat+1; } if (buffer_pat == NULL) return -1; /* got out of while loop because of NULL rather than break */ return(*pnum_terminals); } int is_complex_boolean(buffer, len) CHAR *buffer; int len; { int i = 0; CHAR cur = '\0'; while (i < len) { if (buffer[i] == '\\') i+=2; else if (buffer[i] == ',') { if ((cur == ';') || (cur == '~')) return 1; else cur = ','; i++; } else if (buffer[i] == ';') { if ((cur == ',') || (cur == '~')) return 1; else cur = ';'; i++; } /* else if ((buffer[i] == '~') || (buffer[i] == '{') || (buffer[i] == '}')) { */ else if (buffer[i] == '~') { /* even if pattern has just ~s... user must use -v option for single NOT */ return 1; } else i++; } return 0; } /* The possible tokens are: ; , a e ~ { } */ int get_token_bool(buffer, len, ptr, tokenbuf, tokenlen) CHAR *buffer, *tokenbuf; int len, *ptr, *tokenlen; { if ((*ptr>=len) || (buffer[*ptr] == '\n') || (buffer[*ptr] == '\0')) return 'e'; while ((*ptr<len) && (buffer[*ptr] != '\n') && (buffer[*ptr] != '\0') && ((buffer[*ptr] == ' ') || (buffer[*ptr] == '\t'))) (*ptr)++; if ((*ptr>=len) || (buffer[*ptr] == '\n') || (buffer[*ptr] == '\0')) return 'e'; if ((buffer[*ptr] == ',') || (buffer[*ptr] == ';') || (buffer[*ptr] == '~') || (buffer[*ptr] == '{') || (buffer[*ptr] == '}')) { tokenbuf[0] = buffer[*ptr]; *tokenlen = 1; return buffer[(*ptr)++]; } *tokenlen = 0; if (buffer[*ptr] == '\\') { tokenbuf[(*tokenlen)++] = buffer[(*ptr)++]; tokenbuf[(*tokenlen)++] = buffer[(*ptr)++]; } else tokenbuf[(*tokenlen)++] = buffer[(*ptr)++]; while ( (*ptr<len) && (buffer[*ptr] != '\n') && (buffer[*ptr] != '\0') && (buffer[*ptr] != ',') && (buffer[*ptr] != ';') && (buffer[*ptr] != '~') && (buffer[*ptr] != '{') && (buffer[*ptr] != '}')) { if (buffer[*ptr] == '\\') { tokenbuf[(*tokenlen)++] = buffer[(*ptr)++]; tokenbuf[(*tokenlen)++] = buffer[(*ptr)++]; } else tokenbuf[(*tokenlen)++] = buffer[(*ptr)++]; } tokenbuf[*tokenlen] = '\0'; return 'a'; /* It will return next 'e' next time if we broke out of the loop because (*ptr >= len) */ } print_tree(t, level) ParseTree *t; { int i; if (t == NULL) printf("NULL"); else if (t->type == LEAF) { for (i=0; i<level; i++) printf(" "); printf("LEAF %x %x %s\n", t->op, t->terminalindex, t->data.leaf.value); } else if (t->type == INTERNAL) { if (t->data.internal.left != NULL) print_tree(t->data.internal.left, level + 1); for (i=0; i<level; i++) printf(" "); printf("INTERNAL %x\n", t->op); if (t->data.internal.right != NULL) print_tree(t->data.internal.right, level + 1); } } destroy_tree(t) ParseTree *t; { if (t == NULL) return; if (t->type == LEAF) { free(t->data.leaf.value); /* t itself should not be freed: static allocation */ } else if (t->type == INTERNAL) { if (t->data.internal.left != NULL) destroy_tree(t->data.internal.left); if (t->data.internal.right != NULL) destroy_tree(t->data.internal.right); free(t); } } /* * Recursive descent; C-style => AND + OR have equal priority => must bracketize expressions appropriately or will go left->right. * Grammar: * E = {E} | ~a | ~{E} | E ; E | E , E | a * Parser: * One look ahead at each literal will tell you what to do. * ~ has highest priority, ; and , have equal priority (left to right associativity), ~~ is not allowed. */ ParseTree * aparse_tree(buffer, len, bufptr, terminals, pnum_terminals) CHAR *buffer; int len; int *bufptr; ParseTree terminals[]; int *pnum_terminals; { int token, tokenlen; CHAR tokenbuf[MAXNAME]; int oldtokenlen; CHAR oldtokenbuf[MAXNAME]; ParseTree *t, *n, *leftn; token = get_token_bool(buffer, len, bufptr, tokenbuf, &tokenlen); switch(token) { case '{': /* (exp) */ if ((t = aparse_tree(buffer, len, bufptr, terminals, pnum_terminals)) == NULL) return NULL; if ((token = get_token_bool(buffer, len, bufptr, tokenbuf, &tokenlen)) != '}') { fprintf(stderr, "parse error at offset %d\n", *bufptr); destroy_tree(t); return (NULL); } if ((token = get_token_bool(buffer, len, bufptr, tokenbuf, &tokenlen)) == 'e') return t; switch(token) { /* must find boolean infix operator */ case ',': case ';': leftn = t; if ((t = aparse_tree(buffer, len, bufptr, terminals, pnum_terminals)) == NULL) return NULL; n = (ParseTree *)malloc(sizeof(ParseTree)); n->op = (token == ';') ? ANDPAT : ORPAT ; n->type = INTERNAL; n->data.internal.left = leftn; n->data.internal.right = t; return n; /* or end of parent sub expression */ case '}': unget_token_bool(bufptr, tokenlen); /* part of someone else who called me */ return t; default: destroy_tree(t); fprintf(stderr, "parse error at offset %d\n", *bufptr); return NULL; } /* Go one level deeper */ case '~': /* not exp */ if ((token = get_token_bool(buffer, len, bufptr, tokenbuf, &tokenlen)) == 'e') return NULL; switch(token) { case 'a': if (*pnum_terminals >= MAXNUM_PAT) { fprintf(stderr, "Pattern expression too large (> %d)\n", MAXNUM_PAT); return NULL; } n = &terminals[*pnum_terminals]; n->op = 0; n->type = LEAF; n->terminalindex = (*pnum_terminals); n->data.leaf.attribute = 0; n->data.leaf.value = (unsigned char*)malloc(tokenlen + 2); memcpy(n->data.leaf.value, tokenbuf, tokenlen); n->data.leaf.value[tokenlen] = '\0'; (*pnum_terminals)++; n->op |= NOTPAT; t = n; break; case '{': if ((t = aparse_tree(buffer, len, bufptr, terminals, pnum_terminals)) == NULL) return NULL; if (t->op & NOTPAT) t->op &= ~NOTPAT; else t->op |= NOTPAT; if ((token = get_token_bool(buffer, len, bufptr, tokenbuf, &tokenlen)) != '}') { fprintf(stderr, "parse error at offset %d\n", *bufptr); destroy_tree(t); return NULL; } break; default: fprintf(stderr, "parse error at offset %d\n", *bufptr); return NULL; } /* The resulting tree is in t. Now do another lookahead at this level */ if ((token = get_token_bool(buffer, len, bufptr, tokenbuf, &tokenlen)) == 'e') return t; switch(token) { /* must find boolean infix operator */ case ',': case ';': leftn = t; if ((t = aparse_tree(buffer, len, bufptr, terminals, pnum_terminals)) == NULL) return NULL; n = (ParseTree *)malloc(sizeof(ParseTree)); n->op = (token == ';') ? ANDPAT : ORPAT ; n->type = INTERNAL; n->data.internal.left = leftn; n->data.internal.right = t; return n; case '}': unget_token_bool(bufptr, tokenlen); return t; default: destroy_tree(t); fprintf(stderr, "parse error at offset %d\n", *bufptr); return NULL; } case 'a': /* individual term (attr=val) */ if (tokenlen == 0) return NULL; memcpy(oldtokenbuf, tokenbuf, tokenlen); oldtokenlen = tokenlen; token = get_token_bool(buffer, len, bufptr, tokenbuf, &tokenlen); switch(token) { case '}': /* part of case '{' above: else syntax error not detected but semantics ok */ unget_token_bool(bufptr, tokenlen); case 'e': /* endof input */ case ',': case ';': if (*pnum_terminals >= MAXNUM_PAT) { fprintf(stderr, "Pattern expression too large (> %d)\n", MAXNUM_PAT); return NULL; } n = &terminals[*pnum_terminals]; n->op = 0; n->type = LEAF; n->terminalindex = (*pnum_terminals); n->data.leaf.attribute = 0; n->data.leaf.value = (unsigned char*)malloc(oldtokenlen + 2); memcpy(n->data.leaf.value, oldtokenbuf, oldtokenlen); n->data.leaf.value[oldtokenlen] = '\0'; (*pnum_terminals)++; if ((token == 'e') || (token == '}')) return n; /* nothing after terminal in expression */ leftn = n; if ((t = aparse_tree(buffer, len, bufptr, terminals, pnum_terminals)) == NULL) return NULL; n = (ParseTree *)malloc(sizeof(ParseTree)); n->op = (token == ';') ? ANDPAT : ORPAT ; n->type = INTERNAL; n->data.internal.left = leftn; n->data.internal.right = t; return n; default: fprintf(stderr, "parse error at offset %d\n", *bufptr); return NULL; } case 'e': /* can't happen as I always do a lookahead above and return current tree if e */ default: fprintf(stderr, "parse error at offset %d\n", *bufptr); return NULL; } } int asplit_pattern(APattern, AM, terminals, pnum_terminals, pAParse) CHAR *APattern; int AM; ParseTree terminals[]; int *pnum_terminals; ParseTree **pAParse; { int bufptr = 0, ret, i, j; if (is_complex_boolean(APattern, AM)) { AComplexBoolean = 1; *pnum_terminals = 0; if ((*pAParse = aparse_tree(APattern, AM, &bufptr, terminals, pnum_terminals)) == NULL) return -1; /* print_tree(*pAParse, 0); */ return *pnum_terminals; } else { for (i=0; i<AM; i++) { if (APattern[i] == '\\') i++; else if ((APattern[i] == '{') || (APattern[i] == '}')) { /* eliminate it from pattern by shifting (including '\0') since agrep must essentially do a flat search */ for (j=i; j<AM; j++) APattern[j] = APattern[j+1]; AM --; } } AComplexBoolean = 0; *pnum_terminals = 0; if ((ret = asplit_pattern_flat(APattern, AM, terminals, pnum_terminals, (int *)pAParse)) == -1) return -1; return ret; } } /* int dd(b, e) char *b, *e; { int i=0; extern int anum_terminals; extern char amatched_terminals[]; for(;i<anum_terminals; i++) printf("%d ", amatched_terminals[i]); putchar(':'); while (b != e) putchar(*b++); putchar('\n'); return 1; } */ /* fast interpreter for the tree using which terminals matched: array bound checks are not done: its recursive */ int eval_tree(tree, matched_terminals) ParseTree *tree; char matched_terminals[]; { int res; if (tree == NULL) { fprintf(stderr, "Eval on empty tree: returning true\n"); return 1; /* safety sake, but cannot happen! */ } if (tree->type == LEAF) return ((tree->op & NOTPAT) ? (!matched_terminals[tree->terminalindex]) : (matched_terminals[tree->terminalindex])); else if (tree->type == INTERNAL) { if ((tree->op & OPMASK) == ANDPAT) { /* sequential evaluation */ if ((res = eval_tree(tree->data.internal.left, matched_terminals)) != 0) res = eval_tree(tree->data.internal.right, matched_terminals); return (tree->op & NOTPAT) ? !res : res; } else { /* sequential evaluation */ if ((res = eval_tree(tree->data.internal.left, matched_terminals)) == 0) res = eval_tree(tree->data.internal.right, matched_terminals); return (tree->op & NOTPAT) ? !res : res; } } else { fprintf(stderr, "Eval on bad tree: returning false\n"); return 0; /* safety sake, but cannot happen! */ } } /* [first, last) = C-style range for which we want the words in terminal-values' patterns: 0..num_terminals for !ComplexBoolean, term/term otherwise */ asplit_terminal(first, last, pat_buf, pat_ptr) int first, last; char *pat_buf; int *pat_ptr; { int word_length; int type; int num_pat; *pat_ptr = 0; num_pat = 0; for (; first<last; first++) { word_length = strlen(aterminals[first].data.leaf.value); if (word_length <= 0) continue; if ((type = checksg(aterminals[first].data.leaf.value, D, 0)) == -1) return -1; if (!type) return -1; strcpy(&pat_buf[*pat_ptr], aterminals[first].data.leaf.value); pat_buf[*pat_ptr + word_length] = '\n'; pat_buf[*pat_ptr + word_length + 1] = '\0'; *pat_ptr += (word_length + 1); num_pat ++; if(num_pat >= MAXNUM_PAT) { fprintf(stderr, "Warning: too many words in pattern (> %d): ignoring...\n", MAXNUM_PAT); break; } } return num_pat; }